Introduction

This report analyzes Airbnb listings in Crete, Greece, including price distribution, availability, and location trends.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(readr)

Load Airbnb dataset

Firstly, I loaded the Airbnb dataset, converted the date column to a date format and then cleaned the price column by removing symbols and converting it to numeric.

# Read the calendar export, parse the date strings into Date values, and strip
# currency symbols / thousands separators from price before making it numeric.
# NOTE(review): listing_id is parsed as a double; ids around 8.94e+17 are above
# 2^53, so they may not be represented exactly - confirm nothing depends on
# exact id values.
airbnb_calendar <- read.csv("/Users/christosfacondis/Downloads/calendar.csv") %>%
  mutate(
    date = as.Date(date, format = "%Y-%m-%d"),
    price = as.numeric(gsub("[$€,]", "", price))
  )
str(airbnb_calendar)
## 'data.frame':    9545448 obs. of  7 variables:
##  $ listing_id    : num  8.94e+17 8.94e+17 8.94e+17 8.94e+17 8.94e+17 ...
##  $ date          : Date, format: "2024-12-29" "2024-12-30" ...
##  $ available     : chr  "f" "t" "t" "t" ...
##  $ price         : num  44 44 44 44 44 44 44 44 44 44 ...
##  $ adjusted_price: chr  "" "" "" "" ...
##  $ minimum_nights: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ maximum_nights: int  365 365 365 365 365 365 365 365 365 365 ...
# Per-column summary statistics for the calendar data.
summary(airbnb_calendar)
##    listing_id             date             available             price        
##  Min.   :2.797e+04   Min.   :2024-12-29   Length:9545448     Min.   :    0.0  
##  1st Qu.:2.807e+07   1st Qu.:2025-03-30   Class :character   1st Qu.:   69.0  
##  Median :5.313e+07   Median :2025-06-29   Mode  :character   Median :  110.0  
##  Mean   :4.572e+17   Mean   :2025-06-29                      Mean   :  285.6  
##  3rd Qu.:9.240e+17   3rd Qu.:2025-09-28                      3rd Qu.:  222.0  
##  Max.   :1.321e+18   Max.   :2025-12-29                      Max.   :75625.0  
##                                                                               
##  adjusted_price     minimum_nights    maximum_nights     
##  Length:9545448     Min.   :  1.000   Min.   :1.000e+00  
##  Class :character   1st Qu.:  2.000   1st Qu.:3.650e+02  
##  Mode  :character   Median :  3.000   Median :1.125e+03  
##                     Mean   :  4.577   Mean   :8.285e+04  
##                     3rd Qu.:  4.000   3rd Qu.:1.125e+03  
##                     Max.   :999.000   Max.   :2.147e+09  
##                     NA's   :4         NA's   :4

For the listings dataset, I converted the neighbourhood and room_type columns to factors.

# Read the listings export and store the two categorical columns as factors.
airbnb_listings <- read.csv("/Users/christosfacondis/Downloads/listings.csv") %>%
  mutate(across(c(neighbourhood, room_type), as.factor))
str(airbnb_listings)
## 'data.frame':    26152 obs. of  18 variables:
##  $ id                            : num  8.94e+17 8.94e+17 8.94e+17 8.94e+17 8.94e+17 ...
##  $ name                          : chr  "Amaria Studio" "BH417 - R - Villa Chania" "Mylos home" "Gouves family house in Heraklion" ...
##  $ host_id                       : int  514918089 461193921 514261598 515317369 461193921 515332334 172461966 147757667 515377223 104219138 ...
##  $ host_name                     : chr  "Amalia" "The Best Homes Md Lp" "Ευα" "Dimitris" ...
##  $ neighbourhood_group           : logi  NA NA NA NA NA NA ...
##  $ neighbourhood                 : Factor w/ 24 levels "Αγίου Βασιλείου",..: 10 23 11 24 23 13 19 23 23 19 ...
##  $ latitude                      : num  35.3 35.5 35 35.3 35.5 ...
##  $ longitude                     : num  25.1 23.9 25.7 25.3 23.9 ...
##  $ room_type                     : Factor w/ 4 levels "Entire home/apt",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ price                         : int  45 NA NA 138 NA NA NA 62 NA 35 ...
##  $ minimum_nights                : int  2 7 2 3 3 4 4 2 3 2 ...
##  $ number_of_reviews             : int  12 0 6 0 0 3 6 14 7 6 ...
##  $ last_review                   : chr  "2024-10-28" "" "2024-08-29" "" ...
##  $ reviews_per_month             : num  0.63 NA 0.34 NA NA 0.16 0.36 0.76 0.36 0.83 ...
##  $ calculated_host_listings_count: int  2 183 1 1 183 1 3 2 1 2 ...
##  $ availability_365              : int  177 184 0 0 207 0 0 364 0 196 ...
##  $ number_of_reviews_ltm         : int  7 0 3 0 0 1 5 6 3 6 ...
##  $ license                       : chr  "2031039" "1042K91003049701" "00002080199" "00001245657" ...
# Per-column summary statistics for the listings data.
summary(airbnb_listings)
##        id                name              host_id           host_name        
##  Min.   :2.797e+04   Length:26152       Min.   :    51279   Length:26152      
##  1st Qu.:2.807e+07   Class :character   1st Qu.: 55004122   Class :character  
##  Median :5.313e+07   Mode  :character   Median :184448975   Mode  :character  
##  Mean   :4.572e+17                      Mean   :231143302                     
##  3rd Qu.:9.240e+17                      3rd Qu.:414440198                     
##  Max.   :1.321e+18                      Max.   :668568866                     
##                                                                               
##  neighbourhood_group        neighbourhood     latitude       longitude    
##  Mode:logical        Χανίων        :5785   Min.   :34.83   Min.   :23.53  
##  NA's:26152          Ρεθύμνης      :3004   1st Qu.:35.28   1st Qu.:24.02  
##                      Χερσονήσου    :2152   Median :35.36   Median :24.49  
##                      Ηρακλείου     :2091   Mean   :35.35   Mean   :24.62  
##                      Αποκορώνου    :1758   3rd Qu.:35.50   3rd Qu.:25.14  
##                      Αγίου Νικολάου:1491   Max.   :35.59   Max.   :26.28  
##                      (Other)       :9871                                  
##            room_type         price         minimum_nights    number_of_reviews
##  Entire home/apt:23518   Min.   :   10.0   Min.   :  1.000   Min.   :  0.00   
##  Hotel room     :  266   1st Qu.:   65.0   1st Qu.:  2.000   1st Qu.:  1.00   
##  Private room   : 2353   Median :  100.0   Median :  3.000   Median :  7.00   
##  Shared room    :   15   Mean   :  250.7   Mean   :  5.429   Mean   : 19.48   
##                          3rd Qu.:  200.0   3rd Qu.:  4.000   3rd Qu.: 23.00   
##                          Max.   :72942.0   Max.   :999.000   Max.   :605.00   
##                          NA's   :2998                                         
##  last_review        reviews_per_month calculated_host_listings_count
##  Length:26152       Min.   : 0.010    Min.   :  1.00                
##  Class :character   1st Qu.: 0.170    1st Qu.:  1.00                
##  Mode  :character   Median : 0.380    Median :  4.00                
##                     Mean   : 0.585    Mean   : 25.87                
##                     3rd Qu.: 0.770    3rd Qu.: 12.00                
##                     Max.   :10.790    Max.   :283.00                
##                     NA's   :5131                                    
##  availability_365 number_of_reviews_ltm   license         
##  Min.   :  0.0    Min.   :  0.000       Length:26152      
##  1st Qu.:144.0    1st Qu.:  0.000       Class :character  
##  Median :225.0    Median :  2.000       Mode  :character  
##  Mean   :215.2    Mean   :  4.543                         
##  3rd Qu.:322.0    3rd Qu.:  6.000                         
##  Max.   :365.0    Max.   :141.000                         
## 
# Neighbourhood lookup table: read it, then inspect structure and summary.
airbnb_neighbourhoods <- read.csv(
  "/Users/christosfacondis/Downloads/neighbourhoods.csv"
)
str(airbnb_neighbourhoods)
## 'data.frame':    24 obs. of  2 variables:
##  $ neighbourhood_group: logi  NA NA NA NA NA NA ...
##  $ neighbourhood      : chr  "Αγίου Βασιλείου" "Αγίου Νικολάου" "Αμάριου" "Ανωγείων" ...
summary(airbnb_neighbourhoods)
##  neighbourhood_group neighbourhood     
##  Mode:logical        Length:24         
##  NA's:24             Class :character  
##                      Mode  :character
# Review log (one row per review); the date column is left as character here.
airbnb_reviews <- read.csv(
  "/Users/christosfacondis/Downloads/reviews.csv"
)
str(airbnb_reviews)
## 'data.frame':    509419 obs. of  2 variables:
##  $ listing_id: num  27966 27966 27966 27966 27966 ...
##  $ date      : chr  "2011-09-02" "2012-04-06" "2012-07-05" "2012-08-04" ...
summary(airbnb_reviews)
##    listing_id            date          
##  Min.   :2.797e+04   Length:509419     
##  1st Qu.:1.614e+07   Class :character  
##  Median :2.762e+07   Mode  :character  
##  Mean   :1.493e+17                     
##  3rd Qu.:4.965e+07                     
##  Max.   :1.317e+18

Question 1: Which part of Crete has the most Airbnbs?

I began by organising the listings data by neighbourhood to identify the areas with the most listings. I then created a plot to visualise the neighbourhoods with the most Airbnb listings.

# Number of listings in each neighbourhood, largest first.
neighbourhood_density <- airbnb_listings %>%
  group_by(neighbourhood) %>%
  tally(name = "total_listings", sort = TRUE)
head(neighbourhood_density)
## # A tibble: 6 × 2
##   neighbourhood  total_listings
##   <fct>                   <int>
## 1 Χανίων                   5785
## 2 Ρεθύμνης                 3004
## 3 Χερσονήσου               2152
## 4 Ηρακλείου                2091
## 5 Αποκορώνου               1758
## 6 Αγίου Νικολάου           1491
# Horizontal bar chart of the first ten rows of neighbourhood_density (the
# table is sorted by listing count, so these are the ten largest).
ggplot(
  neighbourhood_density[seq_len(10), ],
  aes(x = reorder(neighbourhood, total_listings), y = total_listings)
) +
  geom_col(fill = "skyblue", alpha = 0.8) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  labs(
    title = "Top 10 Neighbourhoods with Most Airbnb Listings",
    x = "Neighbourhood",
    y = "Number of Listings"
  )

Question 2: What is the mean price for each neighbourhood?

First, I removed NA, zero, and non-finite values from the price column. Then, I grouped the listing prices by neighbourhood and computed the following aggregations for each neighbourhood: mean, median, standard deviation, and number of listings. Finally, I sorted the results by median price in descending order, because Airbnb prices don’t follow a normal distribution and the median is less sensitive to extreme values than the mean.

# Data-quality checks on the listing price: how many values are missing, NaN,
# infinite, or non-positive.
with(airbnb_listings, sum(is.na(price)))
## [1] 2998
with(airbnb_listings, sum(is.nan(price)))
## [1] 0
with(airbnb_listings, sum(is.infinite(price)))
## [1] 0
with(airbnb_listings, sum(price <= 0, na.rm = TRUE))
## [1] 0
# Keep only rows with a usable price. is.finite() is FALSE for NA, NaN and
# +/-Inf, so one test covers all three cases.
# NOTE: this overwrites airbnb_listings, so code below that reads
# airbnb_listings sees only the priced subset.
airbnb_listings <- airbnb_listings %>%
  mutate(price = as.numeric(price)) %>%
  filter(is.finite(price), price > 0)


# Price statistics per neighbourhood (mean, median, standard deviation, group
# size), ordered from the highest median price to the lowest.
prices_by_neighbourhood <- arrange(
  summarise(
    group_by(airbnb_listings, neighbourhood),
    mean_price = mean(price, na.rm = TRUE),
    median_price = median(price, na.rm = TRUE),
    sd_price = sd(price, na.rm = TRUE),
    total_listings = n()
  ),
  desc(median_price)
)

prices_by_neighbourhood %>% head(10)
## # A tibble: 10 × 5
##    neighbourhood   mean_price median_price sd_price total_listings
##    <fct>                <dbl>        <dbl>    <dbl>          <int>
##  1 Μυλοποτάμου           425.          198    1223.            665
##  2 Αποκορώνου            287.          170     571.           1596
##  3 Ρεθύμνης              283.          132     779.           2636
##  4 Πλατανιά              270.          130     905.           1147
##  5 Αγίου Βασιλείου       208.          109     634.           1044
##  6 Κισσάμου              352.          109    1030.           1282
##  7 Αγίου Νικολάου        391.          100    1251.           1324
##  8 Φαιστού               206.          100     798.            932
##  9 Χερσονήσου            357.          100    1478.           1895
## 10 Χανίων                205.           97    1146.           4996

I selected the top 10 most expensive neighbourhoods and created a dot-and-line plot. I plotted the median price in red and the mean price in blue, connecting them with a dashed gray line.

# First ten rows of the median-sorted table, i.e. the ten neighbourhoods with
# the highest median price.
expensive_neighbourhoods <- prices_by_neighbourhood[seq_len(10), ]

# Dot-and-line chart: one red dot (median) and one blue dot (mean) per
# neighbourhood, joined by a dashed gray segment.
ggplot(
  expensive_neighbourhoods,
  aes(x = reorder(neighbourhood, median_price))
) +
  geom_point(aes(y = median_price), color = "red", size = 5, alpha = 0.8) +
  geom_point(aes(y = mean_price), color = "blue", size = 5, alpha = 0.8) +
  # Segment from the median to the mean of the same neighbourhood.
  geom_segment(
    aes(y = median_price, yend = mean_price, xend = neighbourhood),
    color = "gray",
    linetype = "dashed"
  ) +
  # Flip so the neighbourhood names read horizontally.
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, face = "italic")
  ) +
  labs(
    title = "Top 10 Neighbourhoods by Airbnb Price (Median vs Mean)",
    subtitle = "Red = Median Price | Blue = Mean Price",
    x = "Neighbourhood",
    y = "Price"
  )

Question 3: Histogram of Airbnb Prices in Crete with Mean and Median Lines

I aimed to visualize the distribution of Airbnb listing prices in Crete by creating a histogram that displays the frequency of different price amounts while also including reference lines for the mean and median prices.

# Re-apply the price cleaning so this chunk also works when run on its own.
# is.finite() is FALSE for NA, NaN and +/-Inf, so one test covers all three.
airbnb_listings <- airbnb_listings %>%
  mutate(price = as.numeric(price)) %>%
  filter(is.finite(price), price > 0)

# Reference values are computed once as scalars over all cleaned prices
# (including the top 5% that the x-axis limit below hides). Passing scalars to
# geom_vline() draws one line each instead of one overplotted line per row.
price_mean <- mean(airbnb_listings$price, na.rm = TRUE)
price_median <- median(airbnb_listings$price, na.rm = TRUE)
price_cap <- quantile(airbnb_listings$price, 0.95, na.rm = TRUE)

ggplot(airbnb_listings, aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "navy", color = "black", alpha = 0.7) +
  # `linewidth` replaces `size` for lines since ggplot2 3.4.0.
  geom_vline(
    xintercept = price_mean,
    color = "red", linetype = "solid", linewidth = 1.2
  ) +
  geom_vline(
    xintercept = price_median,
    color = "black", linetype = "solid", linewidth = 1.2
  ) +
  # Limit the x-axis to the 95th percentile to remove extreme outliers; breaks
  # are only generated inside that range.
  scale_x_continuous(
    limits = c(0, price_cap),
    breaks = seq(0, price_cap, by = 100)
  ) +
  labs(
    title = "Histogram of Airbnb Prices in Crete",
    subtitle = "Red Line = Mean Price | Black Line = Median Price",
    x = "Price Amounts",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, face = "italic")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1120 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Question 3b: Average price per Date

I calculated the average price per date by grouping the calendar data by date and computing the mean price. Then, I created a line plot of the average price over time and formatted the x-axis to display monthly date labels.

 # Mean calendar price for each date, in chronological order.
 avg_price_per_date <- arrange(
   summarise(
     group_by(airbnb_calendar, date),
     AvgPrice = mean(price, na.rm = TRUE)
   ),
   date
 )

# Line chart of the daily average price with one x-axis tick per month.
# Every ggplot component must be chained with `+`: without the `+` after
# geom_line() the labels, theme and date scale are evaluated as a separate
# expression and never reach the plot.
ggplot(avg_price_per_date, aes(x = as.Date(date), y = AvgPrice)) +
  # `linewidth` replaces `size` for lines since ggplot2 3.4.0.
  geom_line(color = "blue", linewidth = 1) +
  labs(
    title = "Average Price Per Date",
    x = "Date",
    y = "Average Price (€)"
  ) +
  theme_minimal() +
  theme(
    # Rotate the x-axis labels so the monthly ticks do not overlap.
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  scale_x_date(date_labels = "%b %d", date_breaks = "1 month")
## NULL

Question 4: Availability per Day

I aimed to analyze and visualize the availability of Airbnb listings per day by filtering and counting available listings, grouping them by date, and plotting the results. When plotting this availability, I observed that the most available listings can be found from May to October.

# Number of calendar rows flagged as available ("t") on each date, in
# chronological order.
availability_per_date <- airbnb_calendar %>%
  filter(available == "t") %>%
  group_by(date) %>%
  summarise(no_listings_available = n()) %>%
  arrange(date)

# Line chart of daily availability. `linewidth` replaces the `size` argument,
# which is deprecated for lines since ggplot2 3.4.0.
ggplot(
  availability_per_date,
  aes(x = as.Date(date), y = no_listings_available)
) +
  geom_line(color = "blue", linewidth = 1) +
  labs(
    title = "Available Listings per Date",
    x = "Date",
    y = "Number of Available Listings"
  ) +
  theme_minimal()

Question 5: Histogram of Number of Reviews per Listing

I aimed to analyze and visualize the distribution of reviews per Airbnb listing in Crete by creating a histogram that displays the frequency of different review counts while also including reference lines for the mean and median.

# Reference values for the vertical lines and their text labels.
mean_reviews <- with(airbnb_listings, mean(number_of_reviews, na.rm = TRUE))
median_reviews <- with(airbnb_listings, median(number_of_reviews, na.rm = TRUE))

# Histogram of review counts restricted to 0-90 on the x-axis, with the mean
# (red) and median (black) marked and labelled.
ggplot(airbnb_listings, aes(x = number_of_reviews)) +
  geom_histogram(bins = 200, fill = "navy", color = "black", alpha = 0.7) +
  geom_vline(aes(xintercept = mean_reviews), color = "red", linewidth = 1.2) +
  geom_vline(aes(xintercept = median_reviews), color = "black", linewidth = 1.2) +
  scale_x_continuous(limits = c(0, 90), breaks = seq(0, 90, 10)) +
  labs(
    title = "Histogram of Number of Reviews per listing",
    x = "Review Amounts",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  guides(fill = "none") +
  # Labels sit at the count of the most frequent review value; the "Median"
  # label is placed 10 units lower than the "Mean" label.
  annotate(
    "text",
    x = mean_reviews,
    y = max(table(airbnb_listings$number_of_reviews)),
    label = "Mean",
    color = "red",
    hjust = -0.2
  ) +
  annotate(
    "text",
    x = median_reviews,
    y = max(table(airbnb_listings$number_of_reviews)) - 10,
    label = "Median",
    color = "black",
    hjust = -0.2
  )
## Warning: Removed 940 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Question 6: Distribution of Airbnb Room Types in Crete

I analyzed the distribution of different Airbnb room types in Crete to understand how prices vary and how common each room type is. I used a bar plot, violin plot, and frequency analysis to explore these insights.

# Number of listings per room type, most common first.
df_room_type <- airbnb_listings %>%
  group_by(room_type) %>%
  tally(name = "no_listings_per_room_type", sort = TRUE)

# Share of each room type among the counted listings, rounded to two decimals
# and stored as a "<value>%" label.
total_listings <- sum(df_room_type$no_listings_per_room_type)
df_room_type <- df_room_type %>%
  mutate(
    percentage = paste0(
      round((no_listings_per_room_type / total_listings) * 100, 2),
      "%"
    )
  )

print(df_room_type)
## # A tibble: 4 × 3
##   room_type       no_listings_per_room_type percentage
##   <fct>                               <int> <chr>     
## 1 Entire home/apt                     21067 90.99%    
## 2 Private room                         1864 8.05%     
## 3 Hotel room                            210 0.91%     
## 4 Shared room                            13 0.06%
# Bar chart of listings per room type (tallest bar first) with the percentage
# label printed above each bar.
ggplot(
  df_room_type,
  aes(
    x = reorder(room_type, -no_listings_per_room_type),
    y = no_listings_per_room_type,
    fill = room_type
  )
) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = percentage), vjust = -0.5, size = 2) +
  scale_fill_manual(values = c("#FFB6C1", "#ADD8E6", "#FFDDC1", "#C1E1C1")) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 10, face = "bold")
  ) +
  labs(
    title = "Number of Listings per Room Type",
    x = "Room Type",
    y = "Number of Listings"
  )

# Listings with a positive price strictly below the 95th percentile, so the
# extreme prices do not stretch the violins.
listings <- airbnb_listings %>%
  mutate(price = as.numeric(price)) %>%
  filter(
    !is.na(price),
    price > 0,
    price < quantile(price, 0.95, na.rm = TRUE)
  )

# Violin plot of price per room type with a narrow white box plot on top.
ggplot(listings, aes(x = room_type, y = price, fill = room_type)) +
  geom_violin(alpha = 0.7, trim = FALSE, color = "black") +
  # Outliers are not drawn by the box plot.
  geom_boxplot(width = 0.1, fill = "white", alpha = 0.5, outlier.shape = NA) +
  scale_fill_manual(values = c("#FFB6C1", "#ADD8E6", "#FFDDC1", "#C1E1C1")) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  labs(
    title = "Price Distribution by Room Type",
    x = "Room Type",
    y = "Price"
  )

Question 7: Interactive Map of Airbnb Listings in Crete

I aimed to analyze and visualize the geographic distribution of Airbnb listings in Crete using an interactive map. This map allows users to explore locations dynamically, view listing details, and see how room types are distributed geographically.

library(leaflet)
library(RColorBrewer)

# Colour function that maps each room type to one of four fixed colours.
room_colors <- colorFactor(
  palette = c("red", "blue", "green", "purple"),
  domain = airbnb_listings$room_type
)

# Interactive map: one circle marker per listing, coloured by room type, with
# a popup showing room type, price and review count, plus a colour legend.
leaflet(airbnb_listings) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 3,
    color = ~room_colors(room_type),
    opacity = 0.7,
    fillOpacity = 0.5,
    popup = ~paste("<b>Room Type:</b>", room_type,
                   "<br><b>Price:</b>", price, "€",
                   "<br><b>Reviews:</b>", number_of_reviews)
  ) %>%
  addLegend(
    position = "bottomright",
    pal = room_colors,
    values = ~room_type,
    title = "Room Type"
  )

Question 8: Listings per host

In this question, I checked how many listings each host has.

# Number of listings per host, largest first.
# `.groups = "drop"` returns an ungrouped tibble; without it summarise() keeps
# the result grouped by host_id and emits a "grouped output" message.
# NOTE(review): airbnb_listings is filtered to priced listings earlier in this
# file, so these counts exclude listings without a price - confirm that this
# is intended.
listings_per_host <- airbnb_listings %>%
  group_by(host_id, host_name) %>%
  summarise(no_of_listings = n(), .groups = "drop") %>%
  arrange(desc(no_of_listings))
## `summarise()` has grouped output by 'host_id'. You can override using the
## `.groups` argument.
# Show the first rows, i.e. the hosts with the most listings.
head(listings_per_host)
## # A tibble: 6 × 3
## # Groups:   host_id [6]
##     host_id host_name            no_of_listings
##       <int> <chr>                         <int>
## 1   4301312 Valia                           276
## 2  12389816 Kostas                          259
## 3  61036077 Stratos                         222
## 4  40567005 Emmanuel & Yannis               179
## 5 461193921 The Best Homes Md Lp            153
## 6   9114389 Antonis                         141